{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Data downloaded on April 10, 2020\n",
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "bls_jobs_data = 'https://download.bls.gov/pub/time.series/ce/ce.data.0.AllCESSeries'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "bls_df = pd.read_csv(bls_jobs_data,sep='\\t')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "bls_ind_code = 'https://download.bls.gov/pub/time.series/ce/ce.industry'\n",
    "ind_code_cross = pd.read_csv(bls_ind_code, sep='\\t')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "    industry_code naics_code publishing_status  \\\n8        10211000        211                 A   \n9        10212000        212                 A   \n21       10213000        213                 A   \n24       20236000        236                 A   \n33       20237000        237                 A   \n\n                               industry_name  display_level selectable  \\\n8                     Oil and gas extraction              4          T   \n9                 Mining, except oil and gas              4          T   \n21             Support activities for mining              4          T   \n24                 Construction of buildings              4          T   \n33  Heavy and civil engineering construction              4          T   \n\n    sort_sequence  \n8               9  \n9              10  \n21             22  \n24             25  \n33             34  ",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>industry_code</th>\n      <th>naics_code</th>\n      <th>publishing_status</th>\n      <th>industry_name</th>\n      <th>display_level</th>\n      <th>selectable</th>\n      <th>sort_sequence</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>8</th>\n      <td>10211000</td>\n      <td>211</td>\n      <td>A</td>\n      <td>Oil and gas extraction</td>\n      <td>4</td>\n      <td>T</td>\n      <td>9</td>\n    </tr>\n    <tr>\n      <th>9</th>\n      <td>10212000</td>\n      <td>212</td>\n      <td>A</td>\n      <td>Mining, except oil and gas</td>\n      <td>4</td>\n      <td>T</td>\n      <td>10</td>\n    </tr>\n    <tr>\n      <th>21</th>\n      <td>10213000</td>\n      <td>213</td>\n      <td>A</td>\n      <td>Support activities for mining</td>\n      <td>4</td>\n      <td>T</td>\n      <td>22</td>\n    </tr>\n    <tr>\n      <th>24</th>\n      <td>20236000</td>\n      <td>236</td>\n      <td>A</td>\n      <td>Construction of buildings</td>\n      <td>4</td>\n      <td>T</td>\n      <td>25</td>\n    </tr>\n    <tr>\n      <th>33</th>\n      <td>20237000</td>\n      <td>237</td>\n      <td>A</td>\n      <td>Heavy and civil engineering construction</td>\n      <td>4</td>\n      <td>T</td>\n      <td>34</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "metadata": {},
     "execution_count": 5
    }
   ],
   "source": [
    "ind_code_cross.columns = [x.strip() for x in ind_code_cross.columns]\n",
    "ind_code_cross.naics_code = ind_code_cross.naics_code.str.strip()\n",
    "\n",
    "ind_code_cross = ind_code_cross.loc[ind_code_cross.naics_code.str.len() == 3]\n",
    "ind_code_cross.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "                            series_id  value footnote_codes  industry_code  \\\nnaics_code year period                                                       \n211        2019 M01     CES1021100001  142.4            NaN       10211000   \n                M02     CES1021100001  143.6            NaN       10211000   \n                M03     CES1021100001  143.9            NaN       10211000   \n                M04     CES1021100001  146.1            NaN       10211000   \n                M05     CES1021100001  147.8            NaN       10211000   \n\n                       publishing_status           industry_name  \\\nnaics_code year period                                             \n211        2019 M01                    A  Oil and gas extraction   \n                M02                    A  Oil and gas extraction   \n                M03                    A  Oil and gas extraction   \n                M04                    A  Oil and gas extraction   \n                M05                    A  Oil and gas extraction   \n\n                        display_level selectable  sort_sequence  \nnaics_code year period                                           \n211        2019 M01                 4          T              9  \n                M02                 4          T              9  \n                M03                 4          T              9  \n                M04                 4          T              9  \n                M05                 4          T              9  ",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th></th>\n      <th></th>\n      <th>series_id</th>\n      <th>value</th>\n      <th>footnote_codes</th>\n      <th>industry_code</th>\n      <th>publishing_status</th>\n      <th>industry_name</th>\n      <th>display_level</th>\n      <th>selectable</th>\n      <th>sort_sequence</th>\n    </tr>\n    <tr>\n      <th>naics_code</th>\n      <th>year</th>\n      <th>period</th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th rowspan=\"5\" valign=\"top\">211</th>\n      <th rowspan=\"5\" valign=\"top\">2019</th>\n      <th>M01</th>\n      <td>CES1021100001</td>\n      <td>142.4</td>\n      <td>NaN</td>\n      <td>10211000</td>\n      <td>A</td>\n      <td>Oil and gas extraction</td>\n      <td>4</td>\n      <td>T</td>\n      <td>9</td>\n    </tr>\n    <tr>\n      <th>M02</th>\n      <td>CES1021100001</td>\n      <td>143.6</td>\n      <td>NaN</td>\n      <td>10211000</td>\n      <td>A</td>\n      <td>Oil and gas extraction</td>\n      <td>4</td>\n      <td>T</td>\n      <td>9</td>\n    </tr>\n    <tr>\n      <th>M03</th>\n      <td>CES1021100001</td>\n      <td>143.9</td>\n      <td>NaN</td>\n      <td>10211000</td>\n      <td>A</td>\n      <td>Oil and gas extraction</td>\n      <td>4</td>\n      <td>T</td>\n      <td>9</td>\n    </tr>\n    <tr>\n      <th>M04</th>\n      <td>CES1021100001</td>\n      <td>146.1</td>\n      <td>NaN</td>\n      <td>10211000</td>\n      <td>A</td>\n      <td>Oil and gas extraction</td>\n      <td>4</td>\n      <td>T</td>\n      <td>9</td>\n    </tr>\n    <tr>\n      <th>M05</th>\n      <td>CES1021100001</td>\n      <td>147.8</td>\n      <td>NaN</td>\n      <td>10211000</td>\n      <td>A</td>\n      <td>Oil and gas extraction</td>\n      <td>4</td>\n      <td>T</td>\n      <td>9</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "metadata": {},
     "execution_count": 6
    }
   ],
   "source": [
    "bls_df = bls_df.loc[bls_df.year > 2018]\n",
    "bls_df.columns = [x.strip() for x in bls_df.columns]\n",
    "\n",
    "bls_df.series_id = bls_df.series_id.str.strip()\n",
    "bls_df.period = bls_df.period.str.strip()\n",
    "bls_df.footnote_codes = bls_df.footnote_codes.str.strip()\n",
    "\n",
    "# Keep just all employment (data type = 01)\n",
    "bls_df = bls_df.loc[bls_df.series_id.str[-2:] == '01']\n",
    "\n",
    "# Keep only SA\n",
    "bls_df = bls_df.loc[bls_df.series_id.str[0:3] == 'CES']\n",
    "\n",
    "# Keep only 3-digit industry code\n",
    "bls_df['industry_code'] = bls_df.series_id.str[3:11]\n",
    "bls_df['industry_code'] = bls_df['industry_code'].astype(int)\n",
    "\n",
    "bls_ind_df = pd.merge(bls_df,ind_code_cross,on='industry_code', how='right')\n",
    "#bls_df.head()\n",
    "\n",
    "bls_ind_df.set_index(['naics_code','year','period'],inplace=True)\n",
    "bls_ind_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "                 series_id   value footnote_codes  industry_code  \\\nyear period                                                        \n2020 M04     CES7072200001  6227.1            NaN       70722000   \n     M05     CES7072200001  7688.3            NaN       70722000   \n     M06     CES7072200001  9156.2            NaN       70722000   \n     M07     CES7072200001  9681.5              P       70722000   \n     M08     CES7072200001  9815.1              P       70722000   \n\n            publishing_status                      industry_name  \\\nyear period                                                        \n2020 M04                    A  Food services and drinking places   \n     M05                    A  Food services and drinking places   \n     M06                    A  Food services and drinking places   \n     M07                    A  Food services and drinking places   \n     M08                    A  Food services and drinking places   \n\n             display_level selectable  sort_sequence    ln_emp  ln_emp_diff  \nyear period                                                                  \n2020 M04                 4          T            808  8.736666    -0.628102  \n     M05                 4          T            808  8.947455     0.210789  \n     M06                 4          T            808  9.122187     0.174732  \n     M07                 4          T            808  9.177972     0.055786  \n     M08                 4          T            808  9.191677     0.013705  ",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th></th>\n      <th>series_id</th>\n      <th>value</th>\n      <th>footnote_codes</th>\n      <th>industry_code</th>\n      <th>publishing_status</th>\n      <th>industry_name</th>\n      <th>display_level</th>\n      <th>selectable</th>\n      <th>sort_sequence</th>\n      <th>ln_emp</th>\n      <th>ln_emp_diff</th>\n    </tr>\n    <tr>\n      <th>year</th>\n      <th>period</th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th rowspan=\"5\" valign=\"top\">2020</th>\n      <th>M04</th>\n      <td>CES7072200001</td>\n      <td>6227.1</td>\n      <td>NaN</td>\n      <td>70722000</td>\n      <td>A</td>\n      <td>Food services and drinking places</td>\n      <td>4</td>\n      <td>T</td>\n      <td>808</td>\n      <td>8.736666</td>\n      <td>-0.628102</td>\n    </tr>\n    <tr>\n      <th>M05</th>\n      <td>CES7072200001</td>\n      <td>7688.3</td>\n      <td>NaN</td>\n      <td>70722000</td>\n      <td>A</td>\n      <td>Food services and drinking places</td>\n      <td>4</td>\n      <td>T</td>\n      <td>808</td>\n      <td>8.947455</td>\n      <td>0.210789</td>\n    </tr>\n    <tr>\n      <th>M06</th>\n      <td>CES7072200001</td>\n      <td>9156.2</td>\n      <td>NaN</td>\n      <td>70722000</td>\n      <td>A</td>\n      <td>Food services and drinking places</td>\n      <td>4</td>\n      <td>T</td>\n      <td>808</td>\n      <td>9.122187</td>\n      <td>0.174732</td>\n    </tr>\n    <tr>\n      <th>M07</th>\n      <td>CES7072200001</td>\n      <td>9681.5</td>\n      <td>P</td>\n      <td>70722000</td>\n      <td>A</td>\n      <td>Food services and drinking places</td>\n      <td>4</td>\n      <td>T</td>\n      <td>808</td>\n      <td>9.177972</td>\n      <td>0.055786</td>\n    </tr>\n    <tr>\n      <th>M08</th>\n      <td>CES7072200001</td>\n      <td>9815.1</td>\n      <td>P</td>\n      <td>70722000</td>\n      <td>A</td>\n      <td>Food services and drinking places</td>\n      <td>4</td>\n      <td>T</td>\n      <td>808</td>\n      <td>9.191677</td>\n      <td>0.013705</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "metadata": {},
     "execution_count": 7
    }
   ],
   "source": [
    "# Log monthly difference\n",
    "bls_ind_df['ln_emp'] = np.log(bls_ind_df.value)\n",
    "bls_ind_df['ln_emp_diff'] = bls_ind_df.ln_emp.diff()\n",
    "\n",
    "bls_ind_df.xs('722').tail()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "   naics_code  ln_emp_diff\n0         211    -0.001278\n1         212    -0.004783\n2         213    -0.023811\n3         236    -0.006653\n4         237    -0.014205",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>naics_code</th>\n      <th>ln_emp_diff</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>211</td>\n      <td>-0.001278</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>212</td>\n      <td>-0.004783</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>213</td>\n      <td>-0.023811</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>236</td>\n      <td>-0.006653</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>237</td>\n      <td>-0.014205</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "metadata": {},
     "execution_count": 8
    }
   ],
   "source": [
    "naics_wgts = bls_ind_df.xs([2020,'M03'],level=[1,2])[['ln_emp_diff']].reset_index()\n",
    "\n",
    "naics_wgts['naics_code'] = naics_wgts['naics_code'].astype(int)\n",
    "naics_wgts.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "count    76.000000\nmean     -0.007758\nstd       0.012093\nmin      -0.052838\n25%      -0.011650\n50%      -0.004824\n75%      -0.001284\nmax       0.017665\nName: ln_emp_diff, dtype: float64"
     },
     "metadata": {},
     "execution_count": 9
    }
   ],
   "source": [
    "naics_wgts.ln_emp_diff.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get State Data from QCEW (Processed in QCEW Notebook)\n",
    "df_state_ind_all = pd.read_csv('../../data/bls/state_emp_ind.csv')\n",
    "df_state_ind_all.head()\n",
    "\n",
    "df_state_ind_all['naics_code'] = df_state_ind_all['naics_code'].astype(int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "   area_fips  naics_code  annual_avg_emplvl  STATE  ln_emp_diff  Total_Emp  \\\n0       1000         211                411    1.0    -0.001278    1492707   \n1       2000         211               3522    2.0    -0.001278     225543   \n2       4000         211                 80    4.0    -0.001278    2168037   \n3       5000         211                675    5.0    -0.001278     922683   \n4       6000         211               4734    6.0    -0.001278   12694739   \n\n    emp_wgt  emp_diff_times_share  \n0  0.000275         -3.518706e-07  \n1  0.015616         -1.995610e-05  \n2  0.000037         -4.715622e-08  \n3  0.000732         -9.349039e-07  \n4  0.000373         -4.765629e-07  ",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>area_fips</th>\n      <th>naics_code</th>\n      <th>annual_avg_emplvl</th>\n      <th>STATE</th>\n      <th>ln_emp_diff</th>\n      <th>Total_Emp</th>\n      <th>emp_wgt</th>\n      <th>emp_diff_times_share</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>1000</td>\n      <td>211</td>\n      <td>411</td>\n      <td>1.0</td>\n      <td>-0.001278</td>\n      <td>1492707</td>\n      <td>0.000275</td>\n      <td>-3.518706e-07</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>2000</td>\n      <td>211</td>\n      <td>3522</td>\n      <td>2.0</td>\n      <td>-0.001278</td>\n      <td>225543</td>\n      <td>0.015616</td>\n      <td>-1.995610e-05</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>4000</td>\n      <td>211</td>\n      <td>80</td>\n      <td>4.0</td>\n      <td>-0.001278</td>\n      <td>2168037</td>\n      <td>0.000037</td>\n      <td>-4.715622e-08</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>5000</td>\n      <td>211</td>\n      <td>675</td>\n      <td>5.0</td>\n      <td>-0.001278</td>\n      <td>922683</td>\n      <td>0.000732</td>\n      <td>-9.349039e-07</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>6000</td>\n      <td>211</td>\n      <td>4734</td>\n      <td>6.0</td>\n      <td>-0.001278</td>\n      <td>12694739</td>\n      <td>0.000373</td>\n      <td>-4.765629e-07</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "metadata": {},
     "execution_count": 11
    }
   ],
   "source": [
    "bls_state_ind_df = pd.merge(df_state_ind_all,naics_wgts,on='naics_code',how='right')\n",
    "\n",
    "bls_state_ind_df['Total_Emp'] = bls_state_ind_df.groupby('area_fips')['annual_avg_emplvl'].transform('sum')\n",
    "\n",
    "bls_state_ind_df['emp_wgt'] = bls_state_ind_df.annual_avg_emplvl/bls_state_ind_df.Total_Emp\n",
    "\n",
    "bls_state_ind_df['emp_diff_times_share'] = bls_state_ind_df['emp_wgt']*bls_state_ind_df['ln_emp_diff']\n",
    "\n",
    "bls_state_ind_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "state_bartik_df = bls_state_ind_df.groupby('STATE')[['emp_diff_times_share']].sum().reset_index()\n",
    "\n",
    "state_bartik_df.STATE = state_bartik_df.STATE.astype(int)\n",
    "state_bartik_df.head()\n",
    "state_bartik_df.to_csv('../../data/bls/state_bartiks.csv', index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3-final"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}